Any time you have issues, your first reference should be the ggplot2 help page at http://docs.ggplot2.org/current/.

Building blocks of a ggplot graph:

# One-time setup: uncomment to install the package that ships the Carseats data.
# install.packages("ISLR")
# library() stops immediately when a package is missing; require() would only
# warn and return FALSE, letting the script fail later with a vaguer error.
library(ISLR)      # Carseats data set
library(ggplot2)   # plotting
library(reshape2)  # melt() / dcast() for reshaping

# Load the Carseats data set and preview its first six rows.
data("Carseats")
head(Carseats, n = 6)
##   Sales CompPrice Income Advertising Population Price ShelveLoc Age
## 1  9.50       138     73          11        276   120       Bad  42
## 2 11.22       111     48          16        260    83      Good  65
## 3 10.06       113     35          10        269    80    Medium  59
## 4  7.40       117    100           4        466    97    Medium  55
## 5  4.15       141     64           3        340   128       Bad  38
## 6 10.81       124    113          13        501    72       Bad  78
##   Education Urban  US
## 1        17   Yes Yes
## 2        10   Yes Yes
## 3        12   Yes Yes
## 4        14   Yes Yes
## 5        13   Yes  No
## 6        16    No Yes
# Turn the EuStockMarkets time series into a data frame and keep the
# observation times as an ordinary numeric column named `time`.
data("EuStockMarkets")
stock_data <- cbind(
  as.data.frame(EuStockMarkets),
  time = as.numeric(time(EuStockMarkets))
)
head(stock_data, n = 6)
##       DAX    SMI    CAC   FTSE     time
## 1 1628.75 1678.1 1772.8 2443.6 1991.496
## 2 1613.63 1688.5 1750.5 2460.2 1991.500
## 3 1606.51 1678.6 1718.0 2448.2 1991.504
## 4 1621.04 1684.1 1708.1 2470.4 1991.508
## 5 1618.16 1686.6 1723.1 2484.7 1991.512
## 6 1610.61 1671.6 1714.3 2466.8 1991.515

Aesthetic mapping

How do we map variables to features of the plot? Common examples:

# Sales against Price; ShelveLoc is mapped to point color and Urban to point
# shape. The plot is stored in `plot1` and then displayed.
plot1 <- ggplot(
  Carseats,
  aes(x = Price, y = Sales, color = ShelveLoc, shape = Urban)
) +
  geom_point()
plot1

# Same scatter plot, colored by the numeric Advertising variable, with
# explicit axis labels and a title.
ggplot(Carseats, aes(x = Price, y = Sales, color = Advertising)) +
  geom_point() +
  labs(
    x = "Prices ($)",
    y = "Sales (thousands)",
    title = "Car Seat Sales"
  )

Note that aesthetic mappings set in the top-level ggplot() call are inherited by every layer, but mappings can also be set for an individual layer.

# x and y come from the top-level mapping; color is mapped only inside the
# point layer.
ggplot(Carseats, aes(x = Price, y = Sales)) +
  geom_point(mapping = aes(color = US))

Similarly, data can also be supplied to a specific layer; otherwise it is inherited from the top-level ggplot() call.

# Each point layer gets its own subset of the rows: US stores are colored by
# Urban, non-US stores are drawn without a color mapping.
ggplot(Carseats, aes(x = Price, y = Sales)) +
  geom_point(
    data = Carseats[Carseats$US == "Yes", ],
    mapping = aes(color = Urban)
  ) +
  geom_point(data = Carseats[Carseats$US == "No", ])

Geometric objects

What marks do we want on the plot? Each geom_ object tends to have certain aes arguments that it requires or can use. Examples:

When using multiple geometries, you can directly modify their attributes or add them to the aesthetic (so they get included in legends).

# Copy Carseats and append the fitted values of a linear regression of Sales
# on Price as a new column, Sales_pred.
Carseats_addpred <- Carseats
Carseats_addpred[["Sales_pred"]] <- predict(
  lm(Sales ~ Price, data = Carseats)
)

# Fixed attribute: the line color is set outside aes(), directly on the layer.
ggplot(Carseats_addpred, aes(x = Price)) +
  geom_point(mapping = aes(y = Sales)) +
  geom_line(mapping = aes(y = Sales_pred), color = "Magenta")

# Mapped attribute: color is given inside aes() for both layers, so the labels
# "Data" and "Predicted" are part of the aesthetic and show up in the legend.
ggplot(Carseats_addpred, aes(x = Price)) +
  geom_point(mapping = aes(y = Sales, color = "Data")) +
  geom_line(mapping = aes(y = Sales_pred, color = "Predicted"))

Some miscellaneous other geom_ examples.

# First 100 DAX observations with a smoothed trend; span = 0.3 is handed to
# the smoother that geom_smooth() picks by default.
ggplot(head(stock_data, 100), aes(x = time, y = DAX)) +
  geom_point() +
  geom_smooth(span = 0.3)

# Sales against Price with a straight-line (lm) fit and no confidence band.
ggplot(Carseats, aes(x = Price, y = Sales)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Density of Sales, one curve per shelf location; alpha = 0.3 makes the
# overlapping fills semi-transparent.
ggplot(Carseats, aes(x = Sales, color = ShelveLoc, fill = ShelveLoc)) +
  geom_density(alpha = 0.3)

# Scatter plot annotated with a fixed sloped line (intercept 10, slope -0.04)
# and a dashed vertical line at Price = 115.
ggplot(Carseats, aes(x = Price, y = Sales)) +
  geom_point() +
  geom_abline(intercept = 10, slope = -0.04) +
  geom_vline(xintercept = 115, linetype = "dashed")

# Evaluate a bivariate normal density, with mean and covariance estimated from
# (Price, Sales), on a 50 x 50 grid so its contours can be drawn later.
# library() fails fast if mvtnorm is missing; require() would only warn and
# leave dmvnorm() undefined further down.
library(mvtnorm)
dat <- Carseats[, c("Price", "Sales")]
xgrid <- expand.grid(
  Price = seq(24, 191, length.out = 50),
  Sales = seq(0, 16.3, length.out = 50)
)
# Pick the grid columns by name, in the same order as `dat`, and pass a numeric
# matrix so the result never depends on column order or implicit coercion.
xgrid$p <- dmvnorm(
  as.matrix(xgrid[, names(dat)]),
  mean = colMeans(dat),
  sigma = cov(dat)
)

# Overlay a single density contour (level 3e-4) taken from the precomputed
# grid `xgrid`, whose column `p` holds the density values.
ggplot(Carseats, mapping = aes(x = Price, y = Sales)) +
  geom_point() +
  geom_contour(mapping = aes(z = p), data = xgrid, breaks = 3e-4)

# The same scatter plot with an ellipse computed by ggplot2 itself.
ggplot(Carseats, mapping = aes(x = Price, y = Sales)) +
  geom_point() +
  stat_ellipse()

Aside on melt

You’ll very often find that you need data in long format to plot it using ggplot, which means you will want to use melt from the reshape2 package. To change data in the other direction, from long to wide, you can use dcast.

# Wide format: one column per stock index plus the time column.
head(stock_data, n = 6)
##       DAX    SMI    CAC   FTSE     time
## 1 1628.75 1678.1 1772.8 2443.6 1991.496
## 2 1613.63 1688.5 1750.5 2460.2 1991.500
## 3 1606.51 1678.6 1718.0 2448.2 1991.504
## 4 1621.04 1684.1 1708.1 2470.4 1991.508
## 5 1618.16 1686.6 1723.1 2484.7 1991.512
## 6 1610.61 1671.6 1714.3 2466.8 1991.515
# Long format: `time` stays as the identifier; every other column is stacked
# into a (variable, value) pair.
stock_data_melted <- melt(
  stock_data,
  id.vars = "time",
  variable.name = "variable",
  value.name = "value"
)
head(stock_data_melted, n = 6)
##       time variable   value
## 1 1991.496      DAX 1628.75
## 2 1991.500      DAX 1613.63
## 3 1991.504      DAX 1606.51
## 4 1991.508      DAX 1621.04
## 5 1991.512      DAX 1618.16
## 6 1991.515      DAX 1610.61
# One line per index, distinguished by color.
ggplot(stock_data_melted, aes(x = time, y = value, color = variable)) +
  geom_line()

Scales and Legends

# Base plot reused in the examples that follow: Advertising mapped to color,
# ShelveLoc mapped to shape.
plot2 <- ggplot(
  Carseats,
  aes(x = Price, y = Sales, color = Advertising, shape = ShelveLoc)
) +
  geom_point()

# Retitle the shape legend, order its entries Good / Medium / Bad, and show
# them under the short labels G / M / B.
plot2 + scale_shape_discrete(
  name = "Shelve Location",
  breaks = c("Good", "Medium", "Bad"),
  labels = c("G", "M", "B")
)

# Retitle the color legend, place a break every 5 units from 0 to 30, and
# shade from grey (low Advertising) to red (high Advertising).
# `low` / `high` are arguments of scale_color_gradient(); calling it directly
# avoids relying on scale_color_continuous() forwarding them, which depends on
# the ggplot2.continuous.colour option being left at its gradient default.
plot2 + scale_color_gradient(
  name = "Advertising Level",
  breaks = seq(0, 30, 5),
  low = "grey",
  high = "red"
)

# Attach RColorBrewer with library() so a missing package stops the script
# here instead of producing only a warning, as require() would.
library(RColorBrewer)
# Continuous color scale based on the "YlOrRd" palette; direction = 1 uses the
# palette colors in the order the palette defines them.
plot2 + scale_color_distiller(palette = "YlOrRd", direction = 1)

# Fix the x axis to the range 0-300.
plot2 +
  scale_x_continuous(limits = c(0, 300))

# Log10-transformed y axis.
# NOTE(review): a Sales value of 0 would map to -Inf on this scale - confirm
# how the data behaves here.
plot2 +
  scale_y_log10()

# Reversed y axis.
plot2 +
  scale_y_reverse()

# Fixed aspect ratio between the y and x units (ratio = 5).
plot2 +
  coord_fixed(ratio = 5)

Faceting and visuals

# One panel per level of Urban, laid out as columns.
plot2 +
  facet_grid(. ~ Urban)

# Grid of panels: US in rows, Urban in columns; label_both puts the variable
# name alongside its value in each strip label.
plot2 +
  facet_grid(US ~ Urban, labeller = label_both)

# One panel per stock index from the melted data.
ggplot(stock_data_melted, aes(x = time, y = value)) +
  geom_line() +
  facet_wrap(~ variable)

Themes

# Built-in complete themes.
plot2 +
  theme_bw()

plot2 +
  theme_minimal()

# Custom legend placement: anchor the legend's top-right corner near the
# top-right of the plot, lay multiple legends out side by side, and outline
# the legend background in light grey.
# NOTE(review): recent ggplot2 releases deprecate a numeric legend.position in
# favor of legend.position = "inside" plus legend.position.inside - confirm
# against the ggplot2 version this tutorial targets.
plot2 +
  theme(
    legend.position = c(0.98, 0.98),
    legend.justification = c(1, 1),
    legend.box = "horizontal",
    legend.background = element_rect(color = "lightgrey")
  )

Exercises

  1. Plot the center of each US state by its latitude and longitude. Label each state with its two-letter abbreviation. Size the dots according to each state’s population. Color the states according to their regional division. Note that you can load this data using data(state), after which the state data will be located in:

    • state.center
    • state.abb
    • as.data.frame(state.x77)$Population
    • state.division
  2. dowjones.csv contains weekly percentage changes for a set of 30 stocks. Attempt to replicate the following figure, which visualizes the correlation matrix of the stocks.

    Hints:

    • You might want to look up geom_tile.
    • Try using melt on the correlation matrix.
    • If you want a “prettier” ordering of variables, you can use hclust(distance_matrix)$order to extract one based on hierarchical clustering, as long as you can get an appropriate distance matrix. You might need to use as.dist to convert a matrix to a “distance matrix” form.